Data Preparation and Exploration

# load dataset
airquality <- read.csv('SD_airquality_2016-2020.csv')
# data summary
summary(airquality)
##        X              Date           PM2.5.AQI.Value   Site.Name        
##  Min.   :   1.0   Length:1827        Min.   : 12.00   Length:1827       
##  1st Qu.: 457.5   Class :character   1st Qu.: 48.00   Class :character  
##  Median : 914.0   Mode  :character   Median : 58.00   Mode  :character  
##  Mean   : 914.0                      Mean   : 58.35                     
##  3rd Qu.:1370.5                      3rd Qu.: 67.00                     
##  Max.   :1827.0                      Max.   :172.00                     
##    Site.ID             Source          X20.year.High..2000.2019.
##  Length:1827        Length:1827        Min.   : 61.00           
##  Class :character   Class :character   1st Qu.: 75.00           
##  Mode  :character   Mode  :character   Median : 86.00           
##                                        Mean   : 93.38           
##                                        3rd Qu.:106.00           
##                                        Max.   :289.00           
##  X20.year.Low..2000.2019. X5.year.Average..2015.2019. Date.of.20.year.High
##  Min.   : 0.00            Length:1827                 Length:1827         
##  1st Qu.:22.00            Class :character            Class :character    
##  Median :27.00            Mode  :character            Mode  :character    
##  Mean   :27.85                                                            
##  3rd Qu.:33.00                                                            
##  Max.   :54.00                                                            
##  Date.of.20.year.Low
##  Length:1827        
##  Class :character   
##  Mode  :character   
##                     
##                     
## 
# data visualizations (convert Date into date data type first)
airquality$Date <- as.Date(airquality$Date, format="%m/%d/%Y")

# looking at PM2.5 across all five years
firstfigure <- airquality %>%
  ggplot( aes(x=Date, y=PM2.5.AQI.Value)) +
  geom_area(fill='darkblue', alpha=0.5) +
  geom_line(color='darkblue') +
  ylab('PM 2.5 AQI Value') + 
  ggtitle('PM 2.5 AQI Data from 2016-2020') +
  theme(plot.title=element_text(size=14, face="bold"), 
        axis.title.y=element_text(size=12, face="bold"),
        axis.title.x=element_text(size=12, face="bold"))

firstfigure <- ggplotly(firstfigure)
firstfigure

OBSERVATIONS: - quick observations show that many peak PM2.5 readings occur in Q4 of the year - interestingly, some of the lowest readings occur around MAR/APR - sharp decrease in PM2.5 readings in MAR 2020 (possibly related to COVID lockdowns?) - max PM2.5 reading from SEP-OCT 2020 (coinciding with fires)

# add additional columns for year and day of the year
airquality <- transform(airquality,
                        Year = format(Date, '%Y'),
                        DayOfYear = as.numeric(format(Date, '%j')))
#year comparisons of PM2.5
secondfigure <- airquality %>%
  ggplot( aes(x=DayOfYear, y=PM2.5.AQI.Value, group=Year, colour=Year)) +
  geom_point() + 
  geom_line() +
  ylab('PM 2.5 AQI Value') + 
  ggtitle('PM 2.5 AQI Data - Year Comparisons') +
  theme(plot.title=element_text(size=14, face="bold"), 
        axis.title.y=element_text(size=12, face="bold"),
        axis.title.x=element_text(size=12, face="bold"))

secondfigure <- ggplotly(secondfigure)
secondfigure

OBSERVATIONS: - 2016 has erratic readings during Q1, including both highest and lowest PM2.5 of that year - 2017 has peak readings during last two months of year - 2018 has peaks around JUL and from DEC-FEB - 2019 peaks around OCT, but has noticeably less variation in spikes compared to previous years - 2020 has the biggest range, with the lowest of the 5 year readings occurring MAR2020 and the highest occuring SEP2020 - 2020 has a general trend upwards as year progresses (random walk?) majority of PM2.5 readings occur between 50-100

Let’s create a time series object for the daily PM2.5 data.

PM2.5 <- ts(
  airquality$PM2.5.AQI.Value,
  start = c(2016, 1), # the series starts on the 1st day of 2016
  frequency = 365.25 # number of days per year, accounting for leap years 
)
PM2.5
## Time Series:
## Start = 2016 
## End = 2020.9993155373 
## Frequency = 365.25 
##    [1]  91  66  75  48  39  68  40  58  56  59  62  55  66  76  60  64  74  60
##   [19]  45  53  55  60  64  51  62  67  75  62  86  68  22  43  43  53  53  92
##   [37]  50  70  54  47  53  59  64 117  86  58  46  55  39  53  58  68  76  64
##   [55]  37  64  68  64  73 102  98  83  67  52  37  25  25  30  51  65  58  54
##   [73]  51  26  47  73  63  60  56  61  51  45  53  49  50  58  66  62  33  30
##   [91]  40  61  65  62  63  61  72  62  52  40  20  39  39  51  59  58  40  44
##  [109]  49  59  50  54  29  35  60  56  56  61  52  47  40  55  58  63  57  36
##  [127]  29  30  25  36  31  45  57  56  50  33  35  38  51  58  45  27  33  36
##  [145]  25  20  46  52  54  38  36  51  63  65  63  65  56  46  43  51  54  51
##  [163]  45  40  42  51  58  59  80  97  81  74  67  66  58  67  69  66  68  57
##  [181]  61  71  66  54  52  53  47  41  52  53  51  54  62  64  60  64  62  56
##  [199]  54  51  52  58  65  74  68  65  68  60  53  53  62  62  57  52  41  49
##  [217]  56  58  57  56  63  63  61  58  52  52  59  64  58  63  59  53  55  55
##  [235]  56  56  60  60  57  53  58  65  69  68  65  61  59  59  62  62  65  63
##  [253]  62  62  59  52  39  43  51  58  59  74  70  62  60  64  74  68  61  54
##  [271]  57  54  53  56  60  58  54  49  46  51  51  55  51  52  39  41  48  55
##  [289]  53  26  23  35  63  52  48  54  61  68  51  60  66  60  56  41  38  37
##  [307]  49  58  63  64  61  62  66  48  46  46  56  57  55  70  63  53  56  59
##  [325]  56  44  41  59  72  74  70  49  58  53  59  61  56  58  72  70  66  62
##  [343]  69  73  68  54  50  63  93  86  44  62  90  91  74  50  36  43  42  76
##  [361]  73  69  74  61  63  46  54  61  54  55  47  54  67  63  73  54  49  54
##  [379]  48  70  66  64  64  66  54  46  56  52  34  39  56  65  64  73  64  52
##  [397]  63  78  63  70  37  44  48  33  38  52  55  51  45  49  57  50  66  75
##  [415]  32  39  38  39  38  33  26  24  17  12  62  34  33  32  45  81  42  42
##  [433]  47  58  53  58  63  60  52  55  56  55  60  48  46  18  23  28  32  27
##  [451]  28  32  38  39  60  58  54  52  35  27  46  59  53  38  29  40  51  38
##  [469]  30  30  54  62  27  18  16  68  63  72  66  50  43  43  46  60  38  41
##  [487]  51  73  63  56  53  34  23  35  46  30  42  41  47  48  37  40  35  30
##  [505]  43  49  46  38  43  38  32  18  14  27  46  51  47  30  49  53  43  46
##  [523]  41  39  40  30  21  26  32  45  53  56  57  51  43  45  48  44  45  42
##  [541]  41  62  75  72  46  44  60  62  55  51  51  52  51  57  70  78  63  52
##  [559]  35  51  37  53  60  56  54  48  51  53  54  53  40  34  30  40  51  57
##  [577]  57  61  56  53  48  54  58  62  43  62  71  59  62  66  67  57  38  44
##  [595]  54  55  52  44  49  53  55  60  67  67  60  61  80  88  76  72  64  41
##  [613]  64  59  64  63  58  61  58  58  52  53  38  41  52  61  59  52  51  45
##  [631]  40  29  47  48  53  53  54  57  62  54  48  58  60  62  67  55  68  66
##  [649]  57  66  65  69  75  71  67  72  73  81  54  47  54  56  51  54  76  65
##  [667]  73  61  40  26  26  27  37  36  31  38  56  45  28  32  44  62  74  73
##  [685]  65  68  58  58  58  66  71  60  55  63  65  76  68  48  67  82  88  74
##  [703]  71  66  47  47  44  71  53  53  48  50  56  99  57  70  67  58  73  57
##  [721]  55 106  74  88 105 103 119  88  66  77 105 139  94  94  76  78  82  67
##  [739]  68  46  40  49  76  56  51  59  64  88  72  63  68  55  52  56  51  47
##  [757]  47  63  54  33  37  59  63  71  83  64  64  70  71  62  75  63  87  66
##  [775]  51  49  41  41  41  51 143  53  50  41  37  49  52  51  43  37  40  32
##  [793]  25  24  35  21  44  26  38  61  36  34  46  32  45  45  25  18  35  49
##  [811]  47  50  38  23  34  30  38  41  63  62  55  59  62  54  58  62  58  58
##  [829]  51  71  79  59  57  61  44  53  72  57  54  43  37  56  58  60  62  68
##  [847]  65  66  66  64  63  50  57  47  61  73  74  59  56  62  64  59  54  31
##  [865]  33  45  41  32  36  46  45  38  43  38  43  22  20  35  43  54  57  33
##  [883]  36  48  61  61  65  63  58  56  56  61  59  59  54  60  60  60  53  27
##  [901]  41  57  59  62  60  57  55  56  44  49  58  63  53  29  37  42  63  64
##  [919]  70  69  63  62  62  59  56  55  55  54  52  45  53  53  39  28  40  59
##  [937]  65  70  71  64  62  57  48  43  51  57  52  53  66  60  66  59  73  53
##  [955]  46  41  52  48  51  57  66  66  74  84  89  72  93 110 142  91  83  75
##  [973]  58  59  77  74  69  62  60  39  50  61  69  62  65  67  45  57  56  53
##  [991]  52  45  40  59  70  69  69  62  56  55  69  78  65  61  57  46  52  42
## [1009]  20  54  43  55  56  54  48  42  49  30  44  43  42  41  41  45  55  53
## [1027]  62  77  73  71  81  68  79  72  69  61  56  57  67  70  60  59  67  64
## [1045]  48  64  77  48  36  48  70  71  65  68  70  67  63  46  53  56  39  44
## [1063]  58  37  46  42  54  44  56  43  39  37  55  67  64  59  64  65  67  61
## [1081]  66  57  62  77  71 112  78  82  92  92  61  54  53  53  57  47  58  40
## [1099]  59  68  68  55  44  51  61  52  80  58  53  56  47  25  52  33  48  60
## [1117]  43  48  52  44  48  46  53  66  88  74  82  61  32  31  27  26  28  38
## [1135]  50  55  45  34  56  53  31  32  30  28  33  38  50  39  38  46  61  57
## [1153]  68  60  57  38  52  28  22  41  40  28  24  29  33  45  39  58  31  24
## [1171]  45  47  64  74  41  30  27  34  40  52  66  69  70  40  61  61  62  60
## [1189]  65  33  23  19  43  61  61  69  57  59  70  56  56  56  54  59  56  69
## [1207]  56  48  64  78  79  70  59  57  61  43  30  52  71  83  82  67  35  29
## [1225]  26  37  20  34  50  69  64  64  46  50  51  42  34  43  46  47  47  45
## [1243]  40  36  54  60  69  70  56  47  49  64  56  55  57  60  76  60  58  54
## [1261]  66  63  61  55  41  47  52  47  24  34  57  56  54  58  54  59  62  60
## [1279]  63  70  75  74  69  67  69  68  65  66  71  69  75  74  73  69  64  60
## [1297]  53  46  59  61  63  61  65  66  68  69  77  76  78  72  69  59  61  63
## [1315]  75  71  68  66  60  54  53  63  66  63  61  62  48  43  68  65  64  73
## [1333]  59  66  58  58  67  64  65  63  70  60  59  60  63  59  58  62  54  54
## [1351]  62  75  75  67  65  66  53  59  55  54  62  59  60  64  72  60  45  34
## [1369]  54  55  57  59  64  59  63  67  74  68  69  73  55  59  69  64  81  70
## [1387]  74  66  64  61  64  74  58  59  65  72  87  67  75  98  65  53  73  88
## [1405]  78  83  91  89  71  81  80  91 103 102  85  72  70  60  57  58  40  50
## [1423]  57  56  56  62  71  54  25  53  51  55  70  52  42  45  53  35  40  47
## [1441]  63  64  70  74  76  45  45  45  58  62  70  77  51  48  39  32  53  60
## [1459]  58  55  80  81  64  77  64  68  63  42  59  46  46  48  54  58  62  72
## [1477]  76  87  51  61  68  57  49  57  63  74  70  89  74  58  51  37  45  37
## [1495]  48  49  42  54  63  71  84  63  42  53  58  66  60  64  74  77  70  73
## [1513]  66  60  57  61  65  53  35  31  44  49  60  38  60  57  68  65  46  26
## [1531]  21  20  27  22  29  37  35  39  22  22  26  30  26  26  19  22  21  23
## [1549]  37  47  58  60  67  62  66  69  64  47  13  15  18  24  24  30  40  37
## [1567]  59  50  54  53  28  47  54  57  62  65  75  73  60  63  74  75  69  71
## [1585]  71  68  64  74  80  77  83  77  66  62  52  36  48  57  62  67  42  32
## [1603]  53  55  64  72  74  76  73  67  57  47  33  45  62  66  75  78  53  44
## [1621]  53  59  92  84  58  46  56  57  60  60  64  58  63  60  57  57  54  54
## [1639]  48  58  57  38  46  62  64  59  62  61  67  71  85  80  67  65  64  61
## [1657]  79  70  60  53  51  52  54  62  63  61  55  42  52  62  68  65  68  71
## [1675]  72  77  73  72  69  64  59  58  62  62  61  63  71  70  65  75  72  76
## [1693]  75  74  82  88  77  60  74  79  63  68  90  59  58  59  60  65  73  94
## [1711] 105 119 104  74  92  99 138 161 172 162 163 163 124  81  83  79  83  72
## [1729]  80  77  83  82  74  87  90 104  98 115 159 170 167 162 152 110 107  82
## [1747]  69  90  88  92  91  93  83  92  85  75  78  79  79  72  53  68  88 123
## [1765] 115  79  86  81  88  92 101  85  97  57  64  60  64  71  78  83  83  72
## [1783]  59  68  81  76  93  97 104 104  98 101  93  65  67  71  74  72  70  60
## [1801]  70  83  92  94  71  97  93  98 100  88  92  76  75  83  80  76  74  78
## [1819]  92  92  81  82  92 101  74  72  82

Stationarity

# Augmented Dickey-Fuller test 
adf.test(PM2.5)
## Warning in adf.test(PM2.5): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  PM2.5
## Dickey-Fuller = -7.1044, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary

The p-value is 0.01, so we can reject the null hypothesis that the time series has a unit root in favor of the alternative hypothesis that the time series is stationary.

Decomposition

# Decompose time series into trend, seasonality, and noise 
PM2.5_comps <- decompose(PM2.5)
plot(PM2.5_comps)

The ADF test suggested the series is stationary; however, the decomposition reveals an upward trend and annual seasonality.

Autocorrelation

# ACF and PACF plots (lag=365 selected to represent a year)
acf2(PM2.5, 365.25, main='ACF & PACF')

##      [,1]  [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
## ACF  0.78  0.61 0.50 0.43 0.41 0.39 0.37 0.37 0.35  0.34  0.32  0.31  0.29
## PACF 0.78 -0.01 0.06 0.06 0.10 0.04 0.06 0.07 0.00  0.04  0.01  0.04  0.00
##      [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25]
## ACF   0.30  0.30  0.31  0.31  0.31  0.32  0.31  0.30  0.30  0.30   0.3  0.29
## PACF  0.06  0.03  0.06  0.00  0.05  0.05 -0.01  0.02  0.04  0.04   0.0  0.01
##      [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37]
## ACF   0.30  0.28  0.26  0.24  0.24  0.25  0.25  0.24  0.22  0.21  0.20  0.20
## PACF  0.04 -0.02 -0.03  0.00  0.04  0.01  0.01 -0.02 -0.03  0.01  0.01  0.01
##      [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46] [,47] [,48] [,49]
## ACF   0.19  0.18  0.17  0.17  0.19  0.19  0.19  0.19  0.21  0.22  0.21   0.2
## PACF -0.03 -0.01 -0.03  0.04  0.03 -0.03  0.02  0.02  0.04  0.02 -0.02   0.0
##      [,50] [,51] [,52] [,53] [,54] [,55] [,56] [,57] [,58] [,59] [,60] [,61]
## ACF   0.19  0.16  0.15  0.13  0.12  0.11  0.11  0.11  0.12  0.13  0.13  0.15
## PACF -0.02 -0.03 -0.02 -0.02 -0.01 -0.02  0.01 -0.01  0.03  0.01  0.01  0.04
##      [,62] [,63] [,64] [,65] [,66] [,67] [,68] [,69] [,70] [,71] [,72] [,73]
## ACF   0.14  0.14  0.15  0.17  0.17  0.17  0.16  0.15  0.15  0.14  0.15  0.14
## PACF -0.02  0.00  0.06  0.01  0.00  0.04 -0.05  0.01  0.01  0.01  0.01 -0.04
##      [,74] [,75] [,76] [,77] [,78] [,79] [,80] [,81] [,82] [,83] [,84] [,85]
## ACF   0.14  0.13  0.12  0.12  0.12  0.12  0.13  0.12  0.10  0.08  0.09  0.09
## PACF  0.04 -0.02 -0.01  0.00  0.03 -0.02  0.02 -0.02 -0.03 -0.04  0.04  0.00
##      [,86] [,87] [,88] [,89] [,90] [,91] [,92] [,93] [,94] [,95] [,96] [,97]
## ACF   0.09  0.09  0.10   0.1  0.09  0.09  0.09  0.09  0.10  0.10  0.09  0.08
## PACF -0.02  0.01  0.01   0.0 -0.04  0.00  0.00  0.00  0.03 -0.03 -0.01 -0.02
##      [,98] [,99] [,100] [,101] [,102] [,103] [,104] [,105] [,106] [,107] [,108]
## ACF   0.07  0.07   0.07   0.05   0.04   0.06   0.07   0.08   0.08   0.07   0.07
## PACF  0.01  0.01  -0.02  -0.01  -0.01   0.04   0.01   0.03  -0.01  -0.03   0.02
##      [,109] [,110] [,111] [,112] [,113] [,114] [,115] [,116] [,117] [,118]
## ACF    0.07   0.07   0.08   0.08   0.08   0.06   0.04   0.04   0.04   0.05
## PACF   0.00  -0.02   0.03  -0.02  -0.01  -0.02  -0.03   0.01   0.02   0.02
##      [,119] [,120] [,121] [,122] [,123] [,124] [,125] [,126] [,127] [,128]
## ACF    0.05   0.05   0.05   0.07   0.07   0.06   0.05   0.04   0.03   0.02
## PACF  -0.01   0.00   0.01   0.04   0.00  -0.03  -0.01  -0.02  -0.03  -0.02
##      [,129] [,130] [,131] [,132] [,133] [,134] [,135] [,136] [,137] [,138]
## ACF    0.01   0.01   0.02   0.02   0.01   0.01   0.01   0.01  -0.01  -0.02
## PACF   0.00   0.00  -0.02   0.02  -0.02   0.01  -0.01  -0.02  -0.03  -0.02
##      [,139] [,140] [,141] [,142] [,143] [,144] [,145] [,146] [,147] [,148]
## ACF   -0.02  -0.02  -0.01  -0.02  -0.03  -0.04  -0.04  -0.04  -0.04  -0.05
## PACF  -0.01   0.01  -0.01  -0.02  -0.02  -0.01  -0.01   0.00   0.00  -0.01
##      [,149] [,150] [,151] [,152] [,153] [,154] [,155] [,156] [,157] [,158]
## ACF   -0.06  -0.07  -0.09  -0.08  -0.08  -0.07  -0.06  -0.05  -0.05  -0.05
## PACF  -0.04  -0.02  -0.02   0.01  -0.03   0.02  -0.01   0.03   0.00  -0.01
##      [,159] [,160] [,161] [,162] [,163] [,164] [,165] [,166] [,167] [,168]
## ACF   -0.06  -0.08  -0.08  -0.08  -0.07  -0.05  -0.04  -0.02  -0.03  -0.05
## PACF  -0.02  -0.03   0.02   0.01   0.02   0.03   0.02   0.02  -0.02  -0.02
##      [,169] [,170] [,171] [,172] [,173] [,174] [,175] [,176] [,177] [,178]
## ACF   -0.05  -0.05  -0.07  -0.08  -0.10  -0.11  -0.11  -0.11  -0.11  -0.10
## PACF   0.02   0.00  -0.04  -0.04  -0.03   0.00   0.00   0.01  -0.01   0.01
##      [,179] [,180] [,181] [,182] [,183] [,184] [,185] [,186] [,187] [,188]
## ACF   -0.11  -0.11  -0.11  -0.10  -0.09  -0.07  -0.08  -0.09  -0.10  -0.08
## PACF  -0.03   0.00  -0.01   0.03  -0.01   0.02  -0.04   0.00  -0.02   0.04
##      [,189] [,190] [,191] [,192] [,193] [,194] [,195] [,196] [,197] [,198]
## ACF   -0.08  -0.07  -0.08  -0.08  -0.08  -0.07  -0.06  -0.07  -0.08  -0.09
## PACF  -0.03   0.01  -0.01  -0.02   0.01   0.05   0.01  -0.04   0.00  -0.01
##      [,199] [,200] [,201] [,202] [,203] [,204] [,205] [,206] [,207] [,208]
## ACF   -0.09  -0.08  -0.06  -0.04  -0.04  -0.03  -0.04  -0.06  -0.08  -0.08
## PACF   0.01   0.02   0.04   0.00   0.01   0.02   0.01  -0.05  -0.02   0.00
##      [,209] [,210] [,211] [,212] [,213] [,214] [,215] [,216] [,217] [,218]
## ACF   -0.08  -0.07  -0.06  -0.05  -0.04  -0.03  -0.04  -0.05  -0.05  -0.07
## PACF   0.01   0.01   0.02  -0.01   0.04   0.00  -0.01  -0.02   0.00  -0.01
##      [,219] [,220] [,221] [,222] [,223] [,224] [,225] [,226] [,227] [,228]
## ACF   -0.08  -0.07  -0.07  -0.08  -0.07  -0.06  -0.06  -0.05  -0.05  -0.05
## PACF   0.00   0.01  -0.01  -0.02   0.03   0.03  -0.02   0.01  -0.01   0.00
##      [,229] [,230] [,231] [,232] [,233] [,234] [,235] [,236] [,237] [,238]
## ACF   -0.04  -0.04  -0.02  -0.01  -0.01  -0.01  -0.02  -0.02  -0.01  -0.02
## PACF   0.03  -0.02   0.03   0.01   0.01  -0.01   0.00  -0.01   0.02  -0.02
##      [,239] [,240] [,241] [,242] [,243] [,244] [,245] [,246] [,247] [,248]
## ACF   -0.04  -0.04  -0.04  -0.03  -0.03  -0.03  -0.04  -0.04  -0.05  -0.06
## PACF  -0.01   0.01   0.00   0.02   0.00  -0.01  -0.03   0.00  -0.02  -0.02
##      [,249] [,250] [,251] [,252] [,253] [,254] [,255] [,256] [,257] [,258]
## ACF   -0.07  -0.06  -0.05  -0.04  -0.03  -0.02  -0.02  -0.03  -0.04  -0.03
## PACF  -0.02   0.04  -0.01   0.01   0.04   0.01   0.01  -0.02   0.01   0.01
##      [,259] [,260] [,261] [,262] [,263] [,264] [,265] [,266] [,267] [,268]
## ACF   -0.02  -0.03  -0.04  -0.05  -0.04  -0.03  -0.02  -0.01  -0.01  -0.02
## PACF   0.00  -0.03  -0.02  -0.01   0.03   0.01   0.03   0.01   0.00  -0.02
##      [,269] [,270] [,271] [,272] [,273] [,274] [,275] [,276] [,277] [,278]
## ACF   -0.01  -0.01  -0.04  -0.05  -0.06  -0.04  -0.02   0.00   0.00   0.01
## PACF   0.04  -0.03  -0.04  -0.01   0.01   0.02   0.00   0.04  -0.02   0.02
##      [,279] [,280] [,281] [,282] [,283] [,284] [,285] [,286] [,287] [,288]
## ACF    0.01   0.00  -0.01  -0.02  -0.02  -0.03  -0.02  -0.01  -0.01      0
## PACF   0.00   0.01  -0.03  -0.01  -0.01  -0.01   0.01   0.04   0.00      0
##      [,289] [,290] [,291] [,292] [,293] [,294] [,295] [,296] [,297] [,298]
## ACF    0.01   0.01   0.02   0.03   0.03   0.03   0.03   0.03   0.03   0.02
## PACF   0.00   0.01   0.01   0.03   0.00   0.01   0.01  -0.03   0.05  -0.04
##      [,299] [,300] [,301] [,302] [,303] [,304] [,305] [,306] [,307] [,308]
## ACF    0.02   0.04   0.06   0.08   0.09   0.09   0.09   0.09   0.10   0.10
## PACF   0.02   0.04   0.00   0.04   0.03   0.00   0.04  -0.01   0.02   0.01
##      [,309] [,310] [,311] [,312] [,313] [,314] [,315] [,316] [,317] [,318]
## ACF    0.10   0.08   0.08   0.08   0.09   0.10    0.1   0.09   0.08   0.09
## PACF   0.01  -0.03  -0.01   0.00   0.02   0.04    0.0  -0.03   0.00   0.03
##      [,319] [,320] [,321] [,322] [,323] [,324] [,325] [,326] [,327] [,328]
## ACF    0.09   0.08   0.08   0.09    0.1   0.10   0.10   0.10    0.1   0.10
## PACF  -0.02   0.00  -0.01   0.00    0.0  -0.02  -0.03   0.03    0.0  -0.01
##      [,329] [,330] [,331] [,332] [,333] [,334] [,335] [,336] [,337] [,338]
## ACF    0.08   0.07   0.07   0.08   0.09    0.1   0.10   0.11   0.12   0.13
## PACF  -0.04   0.00   0.00   0.03   0.00    0.0  -0.01   0.03   0.02   0.01
##      [,339] [,340] [,341] [,342] [,343] [,344] [,345] [,346] [,347] [,348]
## ACF    0.11   0.11   0.11   0.11    0.1   0.10   0.09   0.09   0.09   0.09
## PACF  -0.03   0.01   0.00   0.02    0.0  -0.01  -0.02   0.03  -0.02  -0.01
##      [,349] [,350] [,351] [,352] [,353] [,354] [,355] [,356] [,357] [,358]
## ACF    0.09   0.09   0.11   0.11   0.11   0.14   0.13   0.13   0.12   0.12
## PACF   0.01   0.00   0.05  -0.01   0.02   0.05  -0.02   0.00   0.01  -0.01
##      [,359] [,360] [,361] [,362] [,363] [,364] [,365]
## ACF    0.13   0.12   0.13   0.13   0.12   0.11   0.11
## PACF   0.02  -0.02   0.02   0.02  -0.03   0.00   0.01

The autocorrelation at lag 1 is very strong. The ACF tails off, while the PACF cuts off after lag 1, which suggests an AR(1) model. Let’s try it out.

Modeling

Autoregression

# AR(1) model 
sarima(PM2.5, p=1, d=0, q=0)
## initial  value 2.922180 
## iter   2 value 2.443094
## iter   3 value 2.443094
## iter   4 value 2.443094
## iter   4 value 2.443094
## iter   4 value 2.443094
## final  value 2.443094 
## converged
## initial  value 2.443928 
## iter   2 value 2.443927
## iter   3 value 2.443926
## iter   3 value 2.443926
## iter   3 value 2.443926
## final  value 2.443926 
## converged

## $fit
## 
## Call:
## arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, Q), period = S), 
##     xreg = xmean, include.mean = FALSE, transform.pars = trans, fixed = fixed, 
##     optim.control = list(trace = trc, REPORT = 1, reltol = tol))
## 
## Coefficients:
##          ar1    xmean
##       0.7857  58.3502
## s.e.  0.0145   1.2548
## 
## sigma^2 estimated as 132.6:  log likelihood = -7057.45,  aic = 14120.91
## 
## $degrees_of_freedom
## [1] 1825
## 
## $ttable
##       Estimate     SE t.value p.value
## ar1     0.7857 0.0145 54.2573       0
## xmean  58.3502 1.2548 46.5000       0
## 
## $AIC
## [1] 7.729014
## 
## $AICc
## [1] 7.729018
## 
## $BIC
## [1] 7.738062

There are some problems with this model. The tail ends of the residuals deviate quite significantly from the normal distribution, according to the Q-Q plot. Furthermore, the p values of the Ljung-Box statistic are all under .05, which suggests autocorrelation within the residuals.

Let’s try taking the log of the series first to see if that will help normalize the residuals.

sarima(log(PM2.5), 1, 0, 0)
## initial  value -1.110872 
## iter   2 value -1.529028
## iter   3 value -1.529028
## iter   4 value -1.529028
## iter   4 value -1.529028
## iter   4 value -1.529028
## final  value -1.529028 
## converged
## initial  value -1.528453 
## iter   2 value -1.528453
## iter   3 value -1.528454
## iter   4 value -1.528454
## iter   5 value -1.528454
## iter   6 value -1.528454
## iter   7 value -1.528455
## iter   8 value -1.528455
## iter   8 value -1.528455
## iter   8 value -1.528455
## final  value -1.528455 
## converged

## $fit
## 
## Call:
## arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, Q), period = S), 
##     xreg = xmean, include.mean = FALSE, transform.pars = trans, fixed = fixed, 
##     optim.control = list(trace = trc, REPORT = 1, reltol = tol))
## 
## Coefficients:
##          ar1   xmean
##       0.7531  4.0161
## s.e.  0.0154  0.0205
## 
## sigma^2 estimated as 0.04701:  log likelihood = 200.09,  aic = -394.17
## 
## $degrees_of_freedom
## [1] 1825
## 
## $ttable
##       Estimate     SE  t.value p.value
## ar1     0.7531 0.0154  48.9410       0
## xmean   4.0161 0.0205 195.7841       0
## 
## $AIC
## [1] -0.2157482
## 
## $AICc
## [1] -0.2157446
## 
## $BIC
## [1] -0.2066998

Taking the log actually made the residuals even less normal. Let’s try taking the first difference.

sarima(PM2.5, 1, 1, 0)
## initial  value 2.499247 
## iter   2 value 2.494233
## iter   3 value 2.494232
## iter   4 value 2.494232
## iter   5 value 2.494232
## iter   6 value 2.494232
## iter   6 value 2.494232
## final  value 2.494232 
## converged
## initial  value 2.495116 
## iter   2 value 2.495116
## iter   3 value 2.495115
## iter   4 value 2.495115
## iter   5 value 2.495115
## iter   5 value 2.495115
## iter   5 value 2.495115
## final  value 2.495115 
## converged

## $fit
## 
## Call:
## arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, Q), period = S), 
##     xreg = constant, transform.pars = trans, fixed = fixed, optim.control = list(trace = trc, 
##         REPORT = 1, reltol = tol))
## 
## Coefficients:
##           ar1  constant
##       -0.1000   -0.0044
## s.e.   0.0233    0.2579
## 
## sigma^2 estimated as 147:  log likelihood = -7147.06,  aic = 14300.12
## 
## $degrees_of_freedom
## [1] 1824
## 
## $ttable
##          Estimate     SE t.value p.value
## ar1       -0.1000 0.0233 -4.2883  0.0000
## constant  -0.0044 0.2579 -0.0169  0.9865
## 
## $AIC
## [1] 7.831393
## 
## $AICc
## [1] 7.831397
## 
## $BIC
## [1] 7.840446

That didn’t solve the problem either. Let’s try taking both the log and the first difference.

sarima(log(PM2.5), 1, 1, 0)
## initial  value -1.463196 
## iter   2 value -1.468145
## iter   3 value -1.468145
## iter   4 value -1.468145
## iter   5 value -1.468145
## iter   5 value -1.468145
## iter   5 value -1.468145
## final  value -1.468145 
## converged
## initial  value -1.467889 
## iter   2 value -1.467889
## iter   3 value -1.467889
## iter   4 value -1.467889
## iter   4 value -1.467889
## iter   4 value -1.467889
## final  value -1.467889 
## converged

## $fit
## 
## Call:
## arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, Q), period = S), 
##     xreg = constant, transform.pars = trans, fixed = fixed, optim.control = list(trace = trc, 
##         REPORT = 1, reltol = tol))
## 
## Coefficients:
##           ar1  constant
##       -0.0993    0.0000
## s.e.   0.0233    0.0049
## 
## sigma^2 estimated as 0.05309:  log likelihood = 89.38,  aic = -172.77
## 
## $degrees_of_freedom
## [1] 1824
## 
## $ttable
##          Estimate     SE t.value p.value
## ar1       -0.0993 0.0233 -4.2604   0.000
## constant   0.0000 0.0049 -0.0101   0.992
## 
## $AIC
## [1] -0.09461512
## 
## $AICc
## [1] -0.09461151
## 
## $BIC
## [1] -0.08556273

No luck.

Instead of trying to normalize the residuals and remove their autocorrelation, we can search for the optimal lag length using AIC, which balances minimizing model variance with minimizing model complexity.

VARselect(PM2.5,lag.max=50, type="const")
## $selection
## AIC(n)  HQ(n)  SC(n) FPE(n) 
##     19      8      5     19 
## 
## $criteria
##                 1          2          3          4          5          6
## AIC(n)   4.853240   4.853845   4.849992   4.846828   4.837763   4.838416
## HQ(n)    4.855519   4.857264   4.854551   4.852526   4.844601   4.846393
## SC(n)    4.859410   4.863101   4.862334   4.862254   4.856275   4.860013
## FPE(n) 128.154893 128.232468 127.739388 127.335795 126.186767 126.269135
##                 7          8          9         10         11         12
## AIC(n)   4.835665   4.831970   4.833095   4.831457   4.832505   4.832505
## HQ(n)    4.844782   4.842227   4.844492   4.843993   4.846181   4.847321
## SC(n)    4.860348   4.859739   4.863949   4.865396   4.869530   4.872615
## FPE(n) 125.922316 125.457923 125.599152 125.393508 125.525091 125.525094
##                13         14         15         16         17         18
## AIC(n)   4.833547   4.831359   4.832134   4.828167   4.829220   4.828472
## HQ(n)    4.849502   4.848454   4.850369   4.847542   4.849734   4.850126
## SC(n)    4.876742   4.877640   4.881500   4.880618   4.884757   4.887094
## FPE(n) 125.655901 125.381344 125.478559 124.981762 125.113461 125.019893
##                19         20         21         22         23         24
## AIC(n)   4.827284   4.828399   4.829428   4.828070   4.827610   4.828720
## HQ(n)    4.850077   4.852332   4.854501   4.854282   4.854962   4.857211
## SC(n)    4.888991   4.893192   4.897306   4.899033   4.901658   4.905854
## FPE(n) 124.871437 125.010834 125.139559 124.969683 124.912216 125.050988
##                25         26         27         28         29         30
## AIC(n)   4.829546   4.829714   4.830460   4.830884   4.831899   4.831614
## HQ(n)    4.859177   4.860485   4.862371   4.863934   4.866089   4.866944
## SC(n)    4.909765   4.913018   4.916850   4.920360   4.924460   4.927260
## FPE(n) 125.154366 125.175423 125.268905 125.322081 125.449402 125.413669
##                31         32         33         34         35         36
## AIC(n)   4.832428   4.833457   4.834124   4.834648   4.835763   4.836643
## HQ(n)    4.868897   4.871066   4.872873   4.874536   4.876791   4.878811
## SC(n)    4.931159   4.935274   4.939027   4.942635   4.946836   4.950801
## FPE(n) 125.515802 125.645106 125.729045 125.794914 125.935375 126.046281
##                37         38         39         40         41         42
## AIC(n)   4.837752   4.838348   4.839445   4.839697   4.839649   4.839714
## HQ(n)    4.881060   4.882795   4.885032   4.886423   4.887515   4.888719
## SC(n)    4.954996   4.958677   4.962860   4.966197   4.969234   4.972384
## FPE(n) 126.186232 126.261519 126.400199 126.432084 126.426092 126.434365
##                43         44         45         46         47         48
## AIC(n)   4.839959   4.840195   4.840604   4.839504   4.840309   4.840941
## HQ(n)    4.890104   4.891480   4.893028   4.893068   4.895013   4.896784
## SC(n)    4.975715   4.979037   4.982530   4.984516   4.988407   4.992123
## FPE(n) 126.465438 126.495438 126.547189 126.408183 126.510160 126.590141
##                49         50
## AIC(n)   4.842060   4.842598
## HQ(n)    4.899044   4.900721
## SC(n)    4.996328   4.999951
## FPE(n) 126.732083 126.800341

The optimal lag length is 19 based on AIC. Let’s try it out.

sarima(PM2.5, 19, 0, 0)
## initial  value 2.925458 
## iter   2 value 2.791594
## iter   3 value 2.637992
## iter   4 value 2.598467
## iter   5 value 2.494718
## iter   6 value 2.460918
## iter   7 value 2.459083
## iter   8 value 2.422684
## iter   9 value 2.422513
## iter  10 value 2.419214
## iter  11 value 2.419034
## iter  12 value 2.418380
## iter  13 value 2.418324
## iter  14 value 2.418286
## iter  15 value 2.418276
## iter  16 value 2.418275
## iter  17 value 2.418274
## iter  18 value 2.418274
## iter  19 value 2.418274
## iter  20 value 2.418274
## iter  21 value 2.418274
## iter  22 value 2.418274
## iter  23 value 2.418274
## iter  24 value 2.418273
## iter  25 value 2.418272
## iter  26 value 2.418270
## iter  27 value 2.418269
## iter  28 value 2.418268
## iter  29 value 2.418268
## iter  29 value 2.418268
## iter  29 value 2.418268
## final  value 2.418268 
## converged
## initial  value 2.421778 
## iter   2 value 2.421711
## iter   3 value 2.421708
## iter   4 value 2.421707
## iter   5 value 2.421705
## iter   6 value 2.421704
## iter   7 value 2.421704
## iter   7 value 2.421704
## iter   7 value 2.421704
## final  value 2.421704 
## converged

## $fit
## 
## Call:
## arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, Q), period = S), 
##     xreg = xmean, include.mean = FALSE, transform.pars = trans, fixed = fixed, 
##     optim.control = list(trace = trc, REPORT = 1, reltol = tol))
## 
## Coefficients:
##          ar1      ar2     ar3      ar4     ar5      ar6      ar7     ar8
##       0.7652  -0.0678  0.0176  -0.0265  0.0706  -0.0041  -0.0084  0.0737
## s.e.  0.0234   0.0295  0.0296   0.0296  0.0296   0.0297   0.0297  0.0297
##           ar9    ar10     ar11    ar12     ar13    ar14     ar15    ar16
##       -0.0390  0.0363  -0.0249  0.0412  -0.0507  0.0473  -0.0203  0.0599
## s.e.   0.0297  0.0297   0.0297  0.0297   0.0297  0.0297   0.0297  0.0297
##          ar17    ar18    ar19    xmean
##       -0.0269  0.0100  0.0473  58.7594
## s.e.   0.0298  0.0297  0.0235   2.6086
## 
## sigma^2 estimated as 126.8:  log likelihood = -7016.85,  aic = 14075.71
## 
## $degrees_of_freedom
## [1] 1807
## 
## $ttable
##       Estimate     SE t.value p.value
## ar1     0.7652 0.0234 32.7088  0.0000
## ar2    -0.0678 0.0295 -2.3000  0.0216
## ar3     0.0176 0.0296  0.5956  0.5515
## ar4    -0.0265 0.0296 -0.8963  0.3702
## ar5     0.0706 0.0296  2.3860  0.0171
## ar6    -0.0041 0.0297 -0.1378  0.8904
## ar7    -0.0084 0.0297 -0.2818  0.7781
## ar8     0.0737 0.0297  2.4826  0.0131
## ar9    -0.0390 0.0297 -1.3106  0.1902
## ar10    0.0363 0.0297  1.2206  0.2224
## ar11   -0.0249 0.0297 -0.8384  0.4019
## ar12    0.0412 0.0297  1.3894  0.1649
## ar13   -0.0507 0.0297 -1.7079  0.0878
## ar14    0.0473 0.0297  1.5905  0.1119
## ar15   -0.0203 0.0297 -0.6845  0.4937
## ar16    0.0599 0.0297  2.0160  0.0439
## ar17   -0.0269 0.0298 -0.9035  0.3664
## ar18    0.0100 0.0297  0.3369  0.7362
## ar19    0.0473 0.0235  2.0105  0.0445
## xmean  58.7594 2.6086 22.5250  0.0000
## 
## $AIC
## [1] 7.704274
## 
## $AICc
## [1] 7.704529
## 
## $BIC
## [1] 7.767612

The diagnostic plots for AR(19) show that the residuals have no significant autocorrelations, and all Ljung-Box statistics are larger than the threshold p-value. The tails of the residuals distribution do however still deviate from normality. This model has an AIC of 14075.71.

Seasonality-Removed Autoregression

The decomposition of the time series suggests that there is yearly seasonality. We can manually examine yearly lagged PM2.5 obtained by taking a difference with 365th lagged term.

PM2.5_yearly_lagged <- diff(PM2.5, lag=365) 
tsplot(PM2.5_yearly_lagged, main="Seasonality-Removed PM2.5", ylab="")

acf2(PM2.5_yearly_lagged, max.lag=100, main="Autocorrelation of Seasonality-Removed PM2.5")

##      [,1]  [,2] [,3] [,4] [,5] [,6] [,7] [,8]  [,9] [,10] [,11] [,12] [,13]
## ACF  0.72  0.49 0.35 0.27 0.24 0.23 0.20 0.19  0.15  0.14  0.13  0.14  0.12
## PACF 0.72 -0.04 0.02 0.04 0.07 0.03 0.01 0.04 -0.03  0.05 -0.01  0.05 -0.03
##      [,14] [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25]
## ACF   0.13  0.14  0.15  0.14  0.12  0.13  0.11  0.08  0.08  0.10  0.09  0.10
## PACF  0.06  0.03  0.02  0.00  0.00  0.05 -0.04 -0.01  0.03  0.03 -0.02  0.04
##      [,26] [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37]
## ACF   0.12  0.10  0.07  0.07  0.10  0.12  0.13  0.13  0.10  0.07  0.06  0.04
## PACF  0.04 -0.04 -0.03  0.04  0.08 -0.02  0.05 -0.01 -0.03 -0.01  0.00 -0.02
##      [,38] [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46] [,47] [,48] [,49]
## ACF   0.04  0.04  0.03  0.04  0.06  0.06  0.06  0.08  0.10  0.11  0.10  0.10
## PACF  0.00 -0.01 -0.01  0.03  0.02 -0.02  0.03  0.03  0.03  0.02 -0.01 -0.01
##      [,50] [,51] [,52] [,53] [,54] [,55] [,56] [,57] [,58] [,59] [,60] [,61]
## ACF   0.06  0.03  0.00 -0.01 -0.01 -0.02 -0.02 -0.01  0.00  0.00 -0.01  0.01
## PACF -0.05 -0.02 -0.03 -0.01  0.00 -0.03  0.00  0.01  0.01 -0.01 -0.02  0.05
##      [,62] [,63] [,64] [,65] [,66] [,67] [,68] [,69] [,70] [,71] [,72] [,73]
## ACF   0.00 -0.01  0.04  0.07  0.07  0.08  0.05  0.05  0.05  0.05  0.07  0.07
## PACF -0.06 -0.01  0.10  0.01  0.00  0.03 -0.03  0.03  0.00  0.02  0.01  0.00
##      [,74] [,75] [,76] [,77] [,78] [,79] [,80] [,81] [,82] [,83] [,84] [,85]
## ACF   0.09  0.09  0.08  0.09  0.13  0.13  0.15  0.14  0.10  0.06  0.06  0.05
## PACF  0.06  0.01 -0.02  0.04  0.09 -0.03  0.05 -0.02 -0.02 -0.04  0.04  0.00
##      [,86] [,87] [,88] [,89] [,90] [,91] [,92] [,93] [,94] [,95] [,96] [,97]
## ACF   0.06  0.08  0.10  0.08  0.08  0.09  0.10  0.10  0.10  0.06  0.05  0.06
## PACF  0.01  0.03  0.04 -0.03 -0.02  0.04  0.03 -0.01 -0.01 -0.05  0.01  0.00
##      [,98] [,99] [,100]
## ACF   0.05  0.05   0.06
## PACF -0.02  0.02   0.01

The ACF and PACF suggest that the the seasonality-removed PM2.5 still follows an AR(p) of some order p. Let’s use AIC to find the optimal p. 

VARselect(PM2.5_yearly_lagged, lag.max=50, type="const")
## $selection
## AIC(n)  HQ(n)  SC(n) FPE(n) 
##      5      2      1      5 
## 
## $criteria
##                 1          2         3          4          5          6
## AIC(n)   5.507026   5.505033   5.50569   5.504896   5.501098   5.501893
## HQ(n)    5.509806   5.509203   5.51125   5.511847   5.509438   5.511623
## SC(n)    5.514466   5.516193   5.52057   5.523497   5.523418   5.527933
## FPE(n) 246.417206 245.926472 246.08816 245.892986 244.960704 245.155472
##                 7          8          9         10         11         12
## AIC(n)   5.503108   5.503372   5.504377   5.503287   5.504535   5.503202
## HQ(n)    5.514228   5.515882   5.518277   5.518578   5.521215   5.521273
## SC(n)    5.532868   5.536852   5.541578   5.544208   5.549176   5.551563
## FPE(n) 245.453566 245.518351 245.765290 245.497612 245.804158 245.476784
##                13         14         15         16         17         18
## AIC(n)   5.503820   5.502877   5.503125   5.503718   5.505112   5.506405
## HQ(n)    5.523281   5.523727   5.525366   5.527349   5.530133   5.532817
## SC(n)    5.555902   5.558678   5.562646   5.566959   5.572074   5.577087
## FPE(n) 245.628679 245.397018 245.457982 245.603565 245.946329 246.264727
##                19         20         21         22         23         24
## AIC(n)   5.504705   5.504264   5.505461   5.505891   5.505974   5.506978
## HQ(n)    5.532506   5.533455   5.536042   5.537862   5.539336   5.541730
## SC(n)    5.579107   5.582386   5.587303   5.591453   5.595256   5.599981
## FPE(n) 245.846341 245.738092 246.032492 246.138262 246.158953 246.406367
##                25         26         27         28         29         30
## AIC(n)   5.506154   5.505844   5.505087   5.505517   5.504818   5.501505
## HQ(n)    5.542296   5.543375   5.544008   5.545828   5.546519   5.544597
## SC(n)    5.602876   5.606286   5.609249   5.613399   5.616420   5.616827
## FPE(n) 246.203434 246.127147 245.941031 246.046931 245.875190 245.062147
##                31         32         33         34         35         36
## AIC(n)   5.502876   5.501644   5.502457   5.503487   5.504453   5.505799
## HQ(n)    5.547358   5.547516   5.549720   5.552139   5.554495   5.557231
## SC(n)    5.621919   5.624407   5.628940   5.633690   5.638376   5.643442
## FPE(n) 245.398609 245.096542 245.296276 245.549126 245.786742 246.117926
##                37         38         39         40         41         42
## AIC(n)   5.506504   5.507888   5.509213   5.510569   5.510989   5.511897
## HQ(n)    5.559327   5.562100   5.564815   5.567562   5.569372   5.571670
## SC(n)    5.647868   5.652971   5.658016   5.663093   5.667233   5.671861
## FPE(n) 246.291881 246.633179 246.960379 247.295956 247.400136 247.625161
##                43         44         45         46         47         48
## AIC(n)   5.512965   5.513139   5.512850   5.513669   5.514597   5.515968
## HQ(n)    5.574128   5.575692   5.576793   5.579002   5.581320   5.584081
## SC(n)    5.676649   5.680543   5.683974   5.688513   5.693161   5.698253
## FPE(n) 247.890119 247.933529 247.862198 248.065688 248.296369 248.637534
##                49         50
## AIC(n)   5.517252   5.516524
## HQ(n)    5.586755   5.587417
## SC(n)    5.703256   5.706248
## FPE(n) 248.957382 248.776517

The AIC tells us that the seasonality-removed series follows an AR(5) instead of an AR(19).

sarima(PM2.5_yearly_lagged, 5, 0, 0)
## initial  value 3.128618 
## iter   2 value 2.954278
## iter   3 value 2.838606
## iter   4 value 2.788928
## iter   5 value 2.764354
## iter   6 value 2.762098
## iter   7 value 2.761453
## iter   8 value 2.761391
## iter   9 value 2.761374
## iter  10 value 2.761370
## iter  11 value 2.761370
## iter  11 value 2.761370
## iter  11 value 2.761370
## final  value 2.761370 
## converged
## initial  value 2.762558 
## iter   2 value 2.762556
## iter   3 value 2.762556
## iter   3 value 2.762556
## iter   3 value 2.762556
## final  value 2.762556 
## converged

## $fit
## 
## Call:
## arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D, Q), period = S), 
##     xreg = xmean, include.mean = FALSE, transform.pars = trans, fixed = fixed, 
##     optim.control = list(trace = trc, REPORT = 1, reltol = tol))
## 
## Coefficients:
##          ar1      ar2      ar3      ar4     ar5   xmean
##       0.7459  -0.0544  -0.0076  -0.0139  0.0745  3.1012
## s.e.  0.0261   0.0326   0.0327   0.0327  0.0262  1.6165
## 
## sigma^2 estimated as 250.8:  log likelihood = -6113.34,  aic = 12240.69
## 
## $degrees_of_freedom
## [1] 1456
## 
## $ttable
##       Estimate     SE t.value p.value
## ar1     0.7459 0.0261 28.5649  0.0000
## ar2    -0.0544 0.0326 -1.6681  0.0955
## ar3    -0.0076 0.0327 -0.2340  0.8150
## ar4    -0.0139 0.0327 -0.4243  0.6714
## ar5     0.0745 0.0262  2.8474  0.0045
## xmean   3.1012 1.6165  1.9184  0.0553
## 
## $AIC
## [1] 8.372564
## 
## $AICc
## [1] 8.372604
## 
## $BIC
## [1] 8.397881

This model ARIMA has a AIC of 12240.69, which is the best so far.

Forecasting

pred <- sarima.for(
  PM2.5_yearly_lagged, 
  p=5, 
  d=0, 
  q=0, 
  n.ahead=7,
  main="7 Day Forecast of Year-over-Year Delta in PM2.5"
)

Because the model was trained on the 365 day differenced data, a day’s forecasted value represents its delta over that same day the previous year.